# create a files directory if it does not exist
if (!dir.exists('step0_extract_and_eda_files')) {dir.create('step0_extract_and_eda_files')}
# Check and install if these packages are not found locally on machine
if(require(pacman)==FALSE) install.packages("pacman")
if(require(devtools)==FALSE) install.packages("devtools")
if(require(urbnmapr)==FALSE) devtools::install_github('UrbanInstitute/urbnmapr')
if(require(albersusa)==FALSE) devtools::install_github("hrbrmstr/albersusa")
pacman::p_load(tidyverse, magrittr, janitor, lubridate, hms, skimr, # data analysis
fontawesome, rsvg, # for fontawesome icons
pander, knitr, # for nicely printed outputs
scales, plotly, # for plots
urbnmapr, tmap, sf, leaflet, albersusa, tigris, # for maps
gifski, av) # for creating gif and videosCOVID-19 Vaccines by County
Step 0: Extracting the Data and Exploratory Data Analysis
1 Objectives of this Document
The main objectives of this document are as follows:
- Provide a scripted and reproducible document that explains how we extracted the covid vaccination data by county and potential predictors that can explain the variability within this data.
- Present our exploratory data analysis for this data
2 R Setup and Required Packages
In this project, the open-source programming language is used to model the COVID-19 percent fully-vaccinated progression in different U.S. counties. is maintained by an international team of developers who make the language available at The Comprehensive R Archive Network. Readers interested in reusing our code and reproducing our results should have installed locally on their machines. can be installed on a number of different operating systems (see Windows, Mac, and Linux for the installation instructions for these systems). We also recommend using the RStudio interface for . The reader can download RStudio for free by following the instructions at the link. For non-R users, we recommend the Hands-on Programming with R for a brief overview of the software’s functionality. Hereafter, we assume that the reader has an introductory understanding of the programming language.
In the code chunk below, we load the packages used to support our analysis. Our input and output files can also be accessed/ downloaded from fmegahed/vaccines_spatial_and_optimization.
3 Data Extraction
3.1 Extracting the Vaccines Dataset
The CDC has made available a dataset for COVID-19 Vaccinations in the United States by County. Per the link’s description, the dataset captures:
Overall US COVID-19 Vaccine administration data at county level. Data represents all vaccine partners including jurisdictional partner clinics, retail pharmacies, long-term care facilities, dialysis centers, Federal Emergency Management Agency and Health Resources and Services Administration partner sites, and federal entity facilities.
We extracted the following data using the code in the chunk below. The descriptions for each variable are available at CDC’s Data Dictionary, and reproduced below for the reader’s convenience.
-
date:date in year-month-date format.
-
fips:The United States’ Federal Information Processing Standards provide a 5-digit numeric code for geographical locations (where the first two digits are reserved to denote the state number and the following three digits are used to codify the county number within the state). The numbers are in alphabetic order, where the first state01 = Alabama. -
mmwr_week:The week of the epidemiologic year as defined by the Morbidity and Mortality Weekly Report.
-
recip_county:The recipient county
-
recip_state:The recipient state
-
series_complete_pop_pct:Percent of people who are fully vaccinated (have second dose of a two-dose vaccine or one dose of a single-dose vaccine) based on the jurisdiction and county where vaccine recipient lives
-
booster_doses_vax_pct:Percent of people who are fully vaccinated and have received a booster (or additional) dose.
-
svit_ctgy: The CDC Social Vulnerability Index (SVI) rank categorization where:-
A= 0–0.25 SVI rank
-
B= 0.2501–0.50 SVI rank
-
C= 0.5001–0.75 SVI rank
-
D= 0.7501–1.0 SVI rank
-
-
metro_status:Metro vs. non-metro classification type is an aggregation of the six National Center for Health Statistics (NCHS) Urban-Rural Classification Scheme for Counties.-
Metrocounties include Large Central Metropolitan, Large Fringe Metropolitan, Medium Metropolitan, and Small Metropolitan classifications.
-
Non-Metrocounties include Micropolitan and Non-Core (Rural) classifications.
-
-
census2019:2019 Census Population
In addition, we have capitalized on the lubridate package to extract two numeric variables: month and day_of_month since we will aggregate the data by month. Initially, we explored two possible days of the month for aggregating the vaccine data:
+ 1st of month, which is the decision we decided to stick with.
+ 15th of month, which captured the percent vaccinated at the middle of the month. We have decided against using this option due to data reporting inconsistencies in Texas (see time_series_plot_response_15th_of_month.png for an example).
csvLink = "https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD"
# reading the data from the CDC site
raw_vaccines_tbl = read_csv(csvLink) %>%
janitor::clean_names() # all lower case
# saving the time of extract for record keeping
vac_csv_extract_time = Sys.time()
# saving the raw_vaccines_tbl
write_rds(x = raw_vaccines_tbl, file = 'data/raw_vaccines_tbl.rds', compress = 'gz')
# subsetting the read data
vaccines = raw_vaccines_tbl %>%
# Removing unknown (UNK) and territories/states outside of continental US
filter(!recip_state %in% c('AK', 'AS', 'FM', 'GU', 'HI',
'MH', 'MP', 'PR', 'PW', 'UNK', 'VI')) %>%
# Removing data with unknown FIPS code
# since it will result in an unknown county in our data
filter(fips != 'UNK') %>%
# creating new variables
mutate(
# converting the date variable to date
date = mdy(date),
# extracting the day of the month
day_of_month = day(date),
# extracting the month
month = month(date)
) %>%
# reducing data to monthly
dplyr::filter(day_of_month == 1) %>%
# arranging the date by fips code and then by date
arrange(fips, date)
# need to select variables and fixing variable types
vaccines =
vaccines %>%
# selecting variables of interest from the CDC's dataset
select(
date:recip_state,
series_complete_pop_pct,
booster_doses_vax_pct,
svi_ctgy,
metro_status,
census2019
) %>%
# converting select character variables to factor
mutate(
recip_state = as.factor(recip_state),
svi_ctgy = as.factor(svi_ctgy),
metro_status = as.factor(metro_status)
)
# Rounding the time of data extraction to the nearest minute
paste0('The vaccines data were extracted on ',
format(vac_csv_extract_time, '%B %d, %Y'),
' at approximately ',
round_hms(as_hms(vac_csv_extract_time), digits = -2),
format(vac_csv_extract_time, ' %Z'), '.')[1] "The vaccines data were extracted on August 17, 2022 at approximately 00:14:00 EDT."
3.2 Extracting Additional Potential Covariates
While the CDC provides two potential covariates (svi_ctgy and metro_status) that can explain changes in our response variable (series_complete_pop_pct), we have also scraped information for the following predictors:
-
perc_rep_votes, which captures the percent of the total county votes received by the Republican Presidential candidate in the 2020 elections. This variable is computed from the MEDSL Election Returns Data Verse V10.0. We downloaded thecountypres_2000-2020.csvon August 16, 2022 at 09:56:51 EDT
# reading the downloaded CSV
elections = read_csv("data/countypres_2000-2020.csv") %>%
# converting column names to lowercase
janitor::clean_names() %>%
# just keeping data for the most recent presidential election and republican votes
filter(year == 2020 & party == "REPUBLICAN") %>%
# computing percent of republican votes (from total votes)
mutate(perc_rep_votes = 100*(candidatevotes/totalvotes) ) %>%
# keeping only the key and variable used in merge
select(county_fips, perc_rep_votes) %>%
# summing all different types of votes for republican candidate
# since some states break down their votes (e.g., AR, GA, IA, NC, etc) into
# (i.e., prov + absentee + election day + advanced voting)
group_by(county_fips) %>%
summarise(perc_rep_votes = sum(perc_rep_votes, na.rm = T)) %>%
ungroup()
# joining the elections data
vaccines =
left_join(x = vaccines, y = elections,
by = c('fips' = 'county_fips'),
keep = F)
# saving the vaccines dataset
write_rds(x = vaccines, file = 'data/vaccines.rds')4 Exploratory Data Analysis
4.1 Meta Data Summary
skim(vaccines)| Name | vaccines |
| Number of rows | 55944 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| factor | 3 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| fips | 0 | 1 | 5 | 5 | 0 | 3108 | 0 |
| recip_county | 0 | 1 | 10 | 27 | 0 | 1843 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2021-01-01 | 2022-06-01 | 2021-09-16 | 18 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| recip_state | 0 | 1 | FALSE | 49 | TX: 4572, GA: 2862, VA: 2394, KY: 2160 |
| svi_ctgy | 18 | 1 | FALSE | 4 | A: 14094, D: 13986, C: 13968, B: 13878 |
| metro_status | 0 | 1 | FALSE | 2 | Non: 35064, Met: 20880 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| mmwr_week | 0 | 1.00 | 22.72 | 15.40 | 0.00 | 9.00 | 20.00 | 35.00 | 53.00 | ▇▆▆▃▅ |
| series_complete_pop_pct | 78 | 1.00 | 33.24 | 21.12 | 0.00 | 15.10 | 37.00 | 48.70 | 99.90 | ▆▆▇▂▁ |
| booster_doses_vax_pct | 37338 | 0.33 | 41.49 | 10.93 | 0.00 | 35.30 | 41.80 | 48.80 | 95.00 | ▁▅▇▁▁ |
| census2019 | 0 | 1.00 | 104920.24 | 334717.87 | 169.00 | 11130.25 | 26113.00 | 68151.00 | 10039107.00 | ▇▁▁▁▁ |
| perc_rep_votes | 36 | 1.00 | 65.06 | 16.04 | 8.73 | 55.85 | 68.34 | 77.51 | 96.18 | ▁▂▅▇▃ |
4.2 Response Variable: series_complete_pop_pct
4.2.1 Spatiotemporal Visualization
# Getting the counties map from the urbnmapr package and excluding non-continental US
counties_sf = get_urbn_map(map = "counties", sf = TRUE) %>%
filter(!state_name %in% c('Alaska', 'Hawaii') )
# Getting the states map from the urbnmapr package and excluding non-continental US
states_sf = get_urbn_map(map = "states", sf = TRUE) %>%
filter(!state_name %in% c('Alaska', 'Hawaii') )
# Discretizing the response variable to facilitate the visualization
vaccines =
vaccines %>%
mutate(series_complete_pop_pct_disc =
cut(series_complete_pop_pct,
breaks = c(0, 29.9, 39.9, 49.9, 69.9, 100),
labels = c('0-29.9%', '30-39.9%',
'40-49.9%', '50-69.9%', '70%+') ) )
# Left joining counties_sf with the response variable
cty_sf_vac = left_join(
counties_sf,
vaccines %>% select(date, fips, series_complete_pop_pct, series_complete_pop_pct_disc),
by = c("county_fips" = "fips")
)
# Adjusting the bounding boxes for the map so that legend and credits print nicely
# Solution based on https://stackoverflow.com/a/60899644/10156153
bbox_new = st_bbox(counties_sf)
xrange = bbox_new$xmax - bbox_new$xmin # range of x values
yrange = bbox_new$ymax - bbox_new$ymin # range of y values
bbox_new[1] = bbox_new[1] - (0.1 * xrange) # xmin - left
bbox_new[3] = bbox_new[3] + (0.12 * xrange) # xmax - right
bbox_new[2] = bbox_new[2] - (0.1 * yrange) # ymin - bottom
bbox_new = bbox_new %>% # take the bounding box ...
st_as_sfc() # ... and make it a sf polygon
# Create an animated map based on the tmap package
animated_map = tm_shape(cty_sf_vac, bbox = bbox_new) +
tm_borders(col = "gray80", lwd = 0.15) +
tm_fill('series_complete_pop_pct_disc', palette = "YlGnBu", colorNA = "gray50",
title = "% of County's Total Population Fully Vaccinated") +
tm_facets(along = "date", free.coords = FALSE) +
tm_shape(states_sf) +
tm_borders(col = "black", lwd = 0.5) +
tm_credits("Data Source: The CDC's COVID-19 Vaccinations in the US by County Dataset. Created by: Fadel M. Megahed \t")
tmap_animation(animated_map, filename = "figs/animated_vaccine_map.gif", fps = 0.5,
width = 1000, height = 700, outer.margins = 0)
tmap_animation(animated_map, filename = "figs/animated_vaccine_map.mp4", fps = 0.5,
width = 1000, height = 700, outer.margins = 0)4.2.2 Time Series Plots
In the code chunk below, we create a time-series plot of the series_complete_pop_pct for a sample of six counties. Note how, the timeseries tend to follow a logistic growth curve. However, the data shows some irregularities in counties in TX and in GA. For viewers interested in viewing the time-series for additional counties, we invite you to interact with our web app hosted at http://rstudio.fsb.miamioh.edu:3838/megahefm/vaccines/.
# a non-random sampling: six select counties
# selected to include counties from the four different SVI categories
# as well as the two urban classifications
sample_fips = c('04017', '13057', '36061', '39017', '48081', '54009' )
# creating the plot
vaccines %>%
filter(fips %in% sample_fips) %>%
mutate(county_name = paste(recip_county, recip_state, sep = ', ') ) %>%
ggplot(aes(x = date, y = series_complete_pop_pct,
group = county_name, color = svi_ctgy,
shape = metro_status) ) +
geom_line() +
geom_point() +
scale_y_continuous(name = '% Fully Vaccinated',
breaks = scales::pretty_breaks(n=5), limits = c(0,100) ) +
scale_x_date(name = 'Date', breaks = scales::pretty_breaks(n = 4)) +
scale_color_brewer(palette = 'Dark2') +
facet_wrap(~ county_name, ncol = 2) +
theme_bw(base_size = 11) +
theme(
legend.position = 'bottom',
plot.margin = margin(t = 0.1, r = 0.35, b = 0.1, l = 0.1, unit = 'cm')) ->
plot_sample_y_ts
# saving the static figure
ggsave(filename = 'figs/time_series_plot_response.png',
plot = plot_sample_y_ts,
width=10, height=6)
# showing an interactive plotly figure
ggplotly(plot_sample_y_ts, height = 500)4.3 Potential Predictors
4.3.1 SVI Category
# creating an interactive plot for the HTML output
#### Creating a longlat projection (required by leaflet)
leaflet_sf = counties_sf(proj = 'longlat') %>% # from albersua
filter(!state %in% c('Alaska', 'Hawaii'))
leaflet_sf =
leaflet_sf %>%
geo_join(vaccines %>%
select(fips, svi_ctgy, metro_status, census2019, perc_rep_votes) %>%
unique(),
by_sp= 'fips', by_df= 'fips') %>%
mutate(perc_rep_votes_disc =
cut(perc_rep_votes,
breaks = c(0, 19.99, 39.99, 49.9, 59.9, 79.9, 100),
labels = c('0-19.90%', '20-39.9%', '40-49.9%',
'50-59.9%', '60-99.9%', '80%+') ) )
#### Setting the Color Scheme
leaflet_pal = colorFactor('YlOrRd', domain = leaflet_sf$svi_ctgy, na.color = "white")
#### The interactive visual
leaflet() %>% # initializing the leaflet map
setView(lng = -96, lat = 37.8, zoom = 4) %>% # setting the view on Continental US
addTiles() %>% # adding the default tiles
addPolygons(
data = leaflet_sf, stroke = FALSE, fillColor = ~leaflet_pal(leaflet_sf$svi_ctgy), # adding the data
weight = 2, opacity = 1, color = "white", dashArray = "3", fillOpacity = 0.7, # adding color specs
popup = paste(
"County:", leaflet_sf$name, '<br>',
"SVI Cat:", leaflet_sf$svi_ctgy, '<br>',
"Metro Status:", leaflet_sf$metro_status, '<br>',
'% Rep Votes:', round(leaflet_sf$perc_rep_votes, 1), '<br>',
"Population:", scales::comma(round(leaflet_sf$census2019, 1)), '<br>')
) %>% #pop-up Menu
addLegend(position = "bottomleft", pal = leaflet_pal, values = leaflet_sf$svi_ctgy,
title = "SVI Category", opacity = 1) # legend formatting4.3.2 Metro Status
#### Setting the Color Scheme
leaflet_pal = colorFactor('Dark2', domain = leaflet_sf$metro_status, na.color = "white")
#### The interactive visual
leaflet() %>% # initializing the leaflet map
setView(lng = -96, lat = 37.8, zoom = 4) %>% # setting the view on Continental US
addTiles() %>% # adding the default tiles
addPolygons(
data = leaflet_sf, stroke = FALSE, fillColor = ~leaflet_pal(leaflet_sf$metro_status), # adding the data
weight = 2, opacity = 1, color = "white", dashArray = "3", fillOpacity = 0.7, # adding color specs
popup = paste(
"County:", leaflet_sf$name, '<br>',
"SVI Cat:", leaflet_sf$svi_ctgy, '<br>',
"Metro Status:", leaflet_sf$metro_status, '<br>',
'% Rep Votes:', round(leaflet_sf$perc_rep_votes, 1), '<br>',
"Population:", scales::comma(round(leaflet_sf$census2019, 1)), '<br>')
) %>% #pop-up Menu
addLegend(position = "bottomleft", pal = leaflet_pal, values = leaflet_sf$metro_status,
title = "Metro Status", opacity = 1) # legend formatting4.3.3 Percent Republican Votes
#### Setting the Color Scheme
leaflet_pal = colorNumeric('RdYlBu', domain = leaflet_sf$perc_rep_votes, na.color = "white", reverse = T)
#### The interactive visual
leaflet() %>% # initializing the leaflet map
setView(lng = -96, lat = 37.8, zoom = 4) %>% # setting the view on Continental US
addTiles() %>% # adding the default tiles
addPolygons(
data = leaflet_sf, stroke = FALSE, fillColor = ~leaflet_pal(leaflet_sf$perc_rep_votes), # adding the data
weight = 2, opacity = 1, color = "white", dashArray = "3", fillOpacity = 0.7, # adding color specs
popup = paste(
"County:", leaflet_sf$name, '<br>',
"SVI Cat:", leaflet_sf$svi_ctgy, '<br>',
"Metro Status:", leaflet_sf$metro_status, '<br>',
'% Rep Votes:', round(leaflet_sf$perc_rep_votes, 1), '<br>',
"Population:", scales::comma(round(leaflet_sf$census2019, 1)), '<br>')
) %>% #pop-up Menu
addLegend(position = "bottomleft", pal = leaflet_pal, values = leaflet_sf$perc_rep_votes,
title = "% Rep Votes", opacity = 1) # legend formatting5 Appendix
In this appendix, we print all the packages used in our analysis and their versions to assist with reproducing our results/analysis.
pander(sessionInfo(), compact = TRUE) # printing the session infoR version 4.2.1 (2022-06-23 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale: LC_COLLATE=English_United States.utf8, LC_CTYPE=English_United States.utf8, LC_MONETARY=English_United States.utf8, LC_NUMERIC=C and LC_TIME=English_United States.utf8
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: av(v.0.7.0), gifski(v.1.6.6-1), tigris(v.1.6.1), leaflet(v.2.1.1), sf(v.1.0-8), tmap(v.3.3-3), plotly(v.4.10.0), scales(v.1.2.0), knitr(v.1.39), pander(v.0.6.5), rsvg(v.2.3.1), fontawesome(v.0.3.0), skimr(v.2.1.4), hms(v.1.1.1), lubridate(v.1.8.0), janitor(v.2.1.0), magrittr(v.2.0.3), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.9), purrr(v.0.3.4), readr(v.2.1.2), tidyr(v.1.2.0), tibble(v.3.1.8), tidyverse(v.1.3.2), albersusa(v.0.4.1), urbnmapr(v.0.0.0.9002), devtools(v.2.4.4), usethis(v.2.1.6), pacman(v.0.5.1), RColorBrewer(v.1.1-3) and ggplot2(v.3.3.6)
loaded via a namespace (and not attached): readxl(v.1.4.0), uuid(v.1.1-0), backports(v.1.4.1), systemfonts(v.1.0.4), lwgeom(v.0.2-8), repr(v.1.1.4), lazyeval(v.0.2.2), sp(v.1.5-0), crosstalk(v.1.2.0), digest(v.0.6.29), htmltools(v.0.5.3), leaflet.providers(v.1.9.0), fansi(v.1.0.3), memoise(v.2.0.1), googlesheets4(v.1.0.0), tzdb(v.0.3.0), remotes(v.2.4.2), modelr(v.0.1.8), vroom(v.1.5.7), prettyunits(v.1.1.1), colorspace(v.2.0-3), rvest(v.1.0.2), rappdirs(v.0.3.3), textshaping(v.0.3.6), haven(v.2.5.0), xfun(v.0.32), rgdal(v.1.5-32), leafem(v.0.2.0), callr(v.3.7.1), crayon(v.1.5.1), jsonlite(v.1.8.0), glue(v.1.6.2), stars(v.0.5-6), gtable(v.0.3.0), gargle(v.1.2.0), pkgbuild(v.1.3.1), abind(v.1.4-5), DBI(v.1.1.3), miniUI(v.0.1.1.1), Rcpp(v.1.0.9), viridisLite(v.0.4.0), xtable(v.1.8-4), units(v.0.8-0), bit(v.4.0.4), foreign(v.0.8-82), proxy(v.0.4-27), profvis(v.0.3.7), htmlwidgets(v.1.5.4), httr(v.1.4.3), ellipsis(v.0.3.2), farver(v.2.1.1), urlchecker(v.1.0.1), pkgconfig(v.2.0.3), XML(v.3.99-0.10), dbplyr(v.2.2.1), utf8(v.1.2.2), tidyselect(v.1.1.2), rlang(v.1.0.3), later(v.1.3.0), tmaptools(v.3.1-1), munsell(v.0.5.0), cellranger(v.1.1.0), tools(v.4.2.1), cachem(v.1.0.6), cli(v.3.3.0), generics(v.0.1.3), broom(v.1.0.0), evaluate(v.0.16), fastmap(v.1.1.0), ragg(v.1.2.2), yaml(v.2.3.5), bit64(v.4.0.5), processx(v.3.7.0), leafsync(v.0.1.0), fs(v.1.5.2), mime(v.0.12), xml2(v.1.3.3), compiler(v.4.2.1), rstudioapi(v.0.13), curl(v.4.3.2), png(v.0.1-7), e1071(v.1.7-11), reprex(v.2.0.1), stringi(v.1.7.8), highr(v.0.9), ps(v.1.7.1), rgeos(v.0.5-9), lattice(v.0.20-45), classInt(v.0.4-7), vctrs(v.0.4.1), pillar(v.1.8.0), lifecycle(v.1.0.1), data.table(v.1.14.2), maptools(v.1.1-4), raster(v.3.5-21), httpuv(v.1.6.5), R6(v.2.5.1), promises(v.1.2.0.1), KernSmooth(v.2.23-20), sessioninfo(v.1.2.2), codetools(v.0.2-18), dichromat(v.2.0-0.1), assertthat(v.0.2.1), pkgload(v.1.3.0), withr(v.2.5.0), parallel(v.4.2.1), terra(v.1.6-7), grid(v.4.2.1), class(v.7.3-20), rmarkdown(v.2.14), snakecase(v.0.11.0), googledrive(v.2.0.0), shiny(v.1.7.2) and base64enc(v.0.1-3)